from unittest import result
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Default figure size (width, height) for figures that do not set their own.
plt.style.use({'figure.figsize': (25, 20)})

# Use the SimHei font so Chinese titles and tick labels can be rendered, and
# draw minus signs as ASCII hyphens (axes.unicode_minus=False).
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Labelled rows (price known) and the rows whose price is to be predicted.
data1 = pd.read_csv('已知价格数据.csv', sep=',')
data2 = pd.read_csv('估价数据.csv', sep=',')

# Every column except "price" is a feature; "price" is the regression target.
x_train = data1.drop(columns="price")
y_train = data1["price"]
# Random-forest regressor used as the base model.
from sklearn.ensemble import RandomForestRegressor
# Grid search with cross-validation: every combination of the listed parameter
# values is fitted and scored by cross-validation, and the best one is kept.
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid; the three lists are combined exhaustively.
param_grid = {
    'n_estimators': [5, 10, 20, 50, 100, 200],  # number of trees in the forest
    'max_depth': [3, 5, 7, 15],  # maximum tree depth; very deep trees tend to overfit
    # Fraction of the features considered at each split. The last entry has to
    # be the float 1.0 (all features): scikit-learn interprets the int 1 as
    # "exactly one feature per split", which is not a fraction at all.
    'max_features': [0.6, 0.7, 0.8, 1.0],
}

# Seed the forest so the search selects the same model on every run, in line
# with the seeded train/test split below.
rf = RandomForestRegressor(random_state=22)
grid = GridSearchCV(rf, param_grid=param_grid, cv=3)  # 3-fold cross-validation

# Hold out 30% of the labelled rows as an evaluation set.
train_x, test_x, train_y, test_y = train_test_split(
    x_train, y_train, test_size=0.3, random_state=22
)
x_test = data2  # rows whose price is to be predicted
grid.fit(train_x, train_y)
rf_reg = grid.best_estimator_  # model with the best cross-validated parameters
# Show the estimator (and its hyper-parameters) selected by the grid search.
print(rf_reg)

feature_names = x_train.columns
feature_importances = rf_reg.feature_importances_
# argsort is ascending, so the least important feature comes first.
indices = np.argsort(feature_importances)
sorted_names = np.array(feature_names)[indices]
sorted_importances = feature_importances[indices]

# Text report: one line per feature, in ascending order of importance.
print('特征排序：')
for name, importance in zip(sorted_names, sorted_importances):
    print(f'feature {name} ({importance:f})')

# Bar chart of the same ranking, with feature names as x tick labels.
positions = range(len(feature_importances))
plt.figure(figsize=(7, 5))
plt.title('随机森林模型中不同特征的重要程度')
plt.bar(positions, sorted_importances, color='b')
plt.xticks(positions, sorted_names, color='b')
plt.show()
# Score of the selected model on the held-out split (for a scikit-learn
# regressor, score() is the R^2 coefficient of determination).
score = rf_reg.score(test_x, test_y)
print(score)

# Predict the prices for the unlabelled rows and write them out as a
# single "price" column (to_csv also writes the default row index).
submission = pd.DataFrame({"price": rf_reg.predict(x_test)})
submission.to_csv('结果.csv')